In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
In [2]:
from helpers import Timer
In [3]:
from sklearn.datasets import load_files
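# each subdirectory of train/ becomes one class; this assumes only neg/ and pos/
# are present (the unlabeled unsup/ folder of aclImdb removed beforehand)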
reviews_train = load_files("aclImdb/train/")
text_train, y_train = reviews_train.data, reviews_train.target
In [4]:
print("Number of documents in training data: %d" % len(text_train))
print(np.bincount(y_train))
In [5]:
reviews_test = load_files("aclImdb/test/")
text_test, y_test = reviews_test.data, reviews_test.target
print("Number of documents in test data: %d" % len(text_test))
print(np.bincount(y_test))
In [6]:
print(text_train[1])
In [7]:
print(y_train[1])
In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
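# fit builds the vocabulary: one entry per distinct token in the training corpus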
cv.fit(text_train)
len(cv.vocabulary_)
Out[8]:
In [9]:
print(cv.get_feature_names_out()[:50])
print(cv.get_feature_names_out()[50000:50050])
In [10]:
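# transform maps each document to a sparse vector of token counts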
X_train = cv.transform(text_train)
X_train
Out[10]:
In [11]:
print(text_train[19726])
In [12]:
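# column indices of the vocabulary entries that occur in this document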
X_train[19726].nonzero()[1]
Out[12]:
In [13]:
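# vectorize the test data with the vocabulary learned on the training set (no refitting)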
X_test = cv.transform(text_test)
In [14]:
from sklearn.svm import LinearSVC
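# a linear SVM is a strong baseline for high-dimensional sparse bag-of-words features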
svm = LinearSVC()
with Timer():
    svm.fit(X_train, y_train)
In [15]:
svm.score(X_train, y_train)
Out[15]:
In [16]:
svm.score(X_test, y_test)
Out[16]:
In [17]:
def visualize_coefficients(classifier, feature_names, n_top_features=25):
    # get coefficients with large absolute values
    coef = classifier.coef_.ravel()
    positive_coefficients = np.argsort(coef)[-n_top_features:]
    negative_coefficients = np.argsort(coef)[:n_top_features]
    interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])
    # plot them
    plt.figure(figsize=(15, 5))
    colors = ["red" if c < 0 else "blue" for c in coef[interesting_coefficients]]
    plt.bar(np.arange(2 * n_top_features), coef[interesting_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(2 * n_top_features), feature_names[interesting_coefficients], rotation=60, ha="right");
In [18]:
visualize_coefficients(svm, cv.get_feature_names_out())
In [19]:
from sklearn.pipeline import make_pipeline
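# chain vectorizer and classifier so raw text can be passed to fit and score directly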
text_pipe = make_pipeline(CountVectorizer(), LinearSVC())
with Timer():
    text_pipe.fit(text_train, y_train)
text_pipe.score(text_test, y_test)
Out[19]:
In [20]:
from sklearn.model_selection import GridSearchCV
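# tune the SVM regularization parameter C on a log scale with 5-fold cross-validation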
param_grid = {'linearsvc__C': np.logspace(-5, 0, 6)}
grid = GridSearchCV(text_pipe, param_grid, cv=5)
with Timer():
    grid.fit(text_train, y_train)
In [21]:
from figures import plot_grid_1d
plot_grid_1d(grid)
grid.best_params_
Out[21]:
In [22]:
visualize_coefficients(grid.best_estimator_.named_steps['linearsvc'],
                       grid.best_estimator_.named_steps['countvectorizer'].get_feature_names_out())
In [23]:
grid.best_score_
Out[23]:
In [24]:
grid.score(text_test, y_test)
Out[24]:
In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
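# tf-idf rescales raw counts, down-weighting terms that appear in many documents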
tfidf_pipe = make_pipeline(TfidfVectorizer(), LinearSVC())
param_grid = {'linearsvc__C': np.logspace(-3, 2, 6)}
grid = GridSearchCV(tfidf_pipe, param_grid, cv=5)
with Timer():
    grid.fit(text_train, y_train)
plot_grid_1d(grid)
Out[27]:
In [28]:
visualize_coefficients(grid.best_estimator_.named_steps['linearsvc'],
                       grid.best_estimator_.named_steps['tfidfvectorizer'].get_feature_names_out())
In [29]:
grid.best_score_
Out[29]:
In [30]:
grid.score(text_test, y_test)
Out[30]:
In [32]:
text_pipe = make_pipeline(CountVectorizer(), LinearSVC())
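# search the n-gram range jointly with C: unigrams only, up to bigrams, up to trigrams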
param_grid = {'linearsvc__C': np.logspace(-3, 2, 6),
              "countvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)]}
grid = GridSearchCV(text_pipe, param_grid, cv=5)
with Timer():
    grid.fit(text_train, y_train)
In [33]:
# mean cross-validation scores as a grid: rows = n-gram ranges, columns = C values
scores = grid.cv_results_['mean_test_score'].reshape(3, -1)
plt.matshow(scores)
plt.ylabel("n-gram range")
plt.yticks(range(3), param_grid["countvectorizer__ngram_range"])
plt.xlabel("C")
plt.xticks(range(6), param_grid["linearsvc__C"]);
plt.colorbar()
Out[33]:
In [34]:
grid.best_params_
Out[34]:
In [35]:
visualize_coefficients(grid.best_estimator_.named_steps['linearsvc'],
                       grid.best_estimator_.named_steps['countvectorizer'].get_feature_names_out())
In [36]:
grid.score(text_test, y_test)
Out[36]:
In [ ]: